# Load the serialized inputs. The code below reads the objects `thread` and
# `comment`; `author_omega` is used by the final merge of this script.
# NOTE(review): presumably each .RData file provides the object its file name
# suggests -- confirm.
load("02_15_m5s_forum_apr15_author_omega.RData")
load("02_15_m5s_forum_apr15_comment.RData")
load("02_15_m5s_forum_apr15_thread.RData")

# Stack threads and comments into one table with one row per post: all thread
# rows first, then all comment rows. `date` is the creation time converted to
# a plain number via as.numeric(as.POSIXct(...)); no time zone is passed, so
# parsing uses the session's time zone.
forum_text <- local({
  # Parse "YYYY-MM-DDTHH:MM:SS" stamps; kept inside local() so the helper
  # does not leak into the workspace.
  to_epoch <- function(stamp) {
    as.numeric(as.POSIXct(stamp, format = "%Y-%m-%dT%H:%M:%S"))
  }
  data.frame(
    author_unique_id = c(thread$authorUrl, comment$authorUrl),
    type = rep(c("thread", "comment"), times = c(nrow(thread), nrow(comment))),
    text = c(thread$message, comment$message),
    text_id = c(thread$threadId, comment$commentId),
    date = c(to_epoch(thread$createdAt), to_epoch(comment$createdAt)),
    stringsAsFactors = FALSE
  )
})

# Remove authors that can't be uniquely identified
# Keep only the rows whose author_unique_id contains one of the two markers
# below. fixed = TRUE treats the markers as literal substrings; read as
# regular expressions, the dots in the URL would match any character.
forum_text <- subset(
  forum_text,
  grepl("http://www.beppegrillo.it/", author_unique_id, fixed = TRUE) |
    grepl("NEWFK", author_unique_id, fixed = TRUE)
)

# Strip markup-like noise from text.
#
# string: character vector of raw texts.
# Returns a character vector of the same length in which, in this order,
#   1. every "<...>" span (shortest match),
#   2. every line break, either a real CR/LF or the two-character escapes
#      "\r" / "\n" written out literally,
#   3. every run of punctuation characters
# has been replaced by a single space.
cleanText <- function(string){
  # Order matters: tags must go before "<" and ">" are removed as punctuation.
  patterns <- c("<.*?>", "\\\\r|\\\\n|\\n|\\r", "[[:punct:]]+")
  for (pattern in patterns) {
    string <- gsub(pattern, " ", string)
  }
  string
}

# Overwrite each post's raw text with its cleaned version.
forum_text[["text"]] <- cleanText(forum_text[["text"]])

# Count the tokens in a text.
#
# string: the text to tokenize; passed unchanged to tm's scan_tokenizer().
# Returns the number of tokens scan_tokenizer() produces (an integer).
# Stops with an error if the 'tm' package cannot be loaded.
countToken <- function (string) {
  # require() merely returns FALSE (with a warning) when the package is
  # missing, which would surface one line later as an obscure "could not
  # find function" error. Fail here with a clear message instead; when tm is
  # available it is attached exactly as before.
  if (!require(tm)) {
    stop("package 'tm' is required by countToken()", call. = FALSE)
  }
  length(scan_tokenizer(string))
}

# Number of tokens per post. vapply() pins the result to an integer vector;
# sapply() would silently return an empty list if forum_text had no rows.
forum_text$n_words <- vapply(
  forum_text$text,
  countToken,
  integer(1),
  USE.NAMES = FALSE
)

# Persist the cleaned posts together with their word counts.
save(forum_text, file="02_17_m5s_forum_text_cleaned_unique.RData")

# Quick look at the distribution of post lengths (density on a log10 scale).
# Posts with zero tokens give log10(0) = -Inf; see ?density for how infinite
# values are treated.
summary(forum_text$n_words)
with(forum_text, {
  plot(density(log10(n_words), adjust = 5))
})

# library() stops immediately when dplyr is missing; require() would only
# warn and let the pipeline below fail with a less helpful error.
library(dplyr)

# One row per author: number of threads and comments, first and last post
# time, and mean post length overall and per post type.
# mean_length_thread / mean_length_comment are NaN for authors who have no
# post of that type (mean of an empty vector).
forum_activity_author_attributes_01 <-
  forum_text %>%
  group_by(author_unique_id) %>%
  summarise(thread_posts = sum(type == "thread"),
            comment_posts = sum(type == "comment"),
            first_post = min(date),
            last_post = max(date),
            mean_length = mean(n_words),
            mean_length_thread = mean(n_words[type == "thread"]),
            mean_length_comment = mean(n_words[type == "comment"]))

# Time between an author's first and last post, converted from seconds to
# days.
forum_activity_author_attributes_01$permanence_days <-
  with(forum_activity_author_attributes_01,
       (last_post - first_post) / (3600 * 24))

# Posts per day of permanence.
# NOTE: authors whose first and last post share the same timestamp have
# permanence_days == 0, so their frequency is Inf.
forum_activity_author_attributes_01$frequency_post_day <-
  with(forum_activity_author_attributes_01,
       (thread_posts + comment_posts) / permanence_days)

# Diagnostics: distribution of permanence and the share of authors active
# for less than one day. (summary() drops NAs by itself; it has no use for
# an na.rm argument.)
summary(forum_activity_author_attributes_01$permanence_days)
mean(forum_activity_author_attributes_01$permanence_days < 1)

# Lexical variety
# Build the stemmed token vector for a set of texts.
#
# vec: character vector of texts; missing (NA) entries are ignored.
# Returns a character vector holding every space-separated token of the
# combined, lower-cased texts after SnowballC::wordStem() with the Italian
# stemmer, with empty tokens removed.
# Stops with an error if the 'SnowballC' package cannot be loaded.
stemCorpus <- function(vec) {
  # Fail with a clear message instead of require()'s silent FALSE.
  if (!require(SnowballC)) {
    stop("package 'SnowballC' is required by stemCorpus()", call. = FALSE)
  }
  # paste() converts NA to the literal string "NA", which would end up in the
  # corpus as the token "na"; drop missing texts before joining.
  vec <- vec[!is.na(vec)]
  tokens <- strsplit(tolower(paste(vec, collapse = " ")), " ", fixed = TRUE)[[1]]
  stems <- wordStem(tokens, language = "italian")
  # Consecutive spaces produce empty tokens; discard them.
  stems[stems != ""]
}

# Join the texts in `vec` into one lower-cased string, tokenize it with
# koRpus (format = "obj", tag = TRUE, language 'it') and return whatever
# koRpus::MATTR() yields for the tokenized object.
# NOTE(review): presumably MATTR is koRpus' moving-average type-token ratio
# and the return value is a koRpus result object rather than a plain
# number -- confirm against the koRpus documentation.
doMATTR <- function(vec) {
  require(koRpus)
  joined_text <- tolower(paste(vec, collapse = " "))
  tagged <- koRpus::tokenize(joined_text, format = "obj", tag = TRUE, lang = 'it')
  koRpus::MATTR(tagged)
}

# One row per author, with the author's stemmed tokens in a list column.
lex_var_df <-
  forum_text %>%
  group_by(author_unique_id) %>%
  summarize(corpus = list(stemCorpus(text)))

# Total tokens and distinct tokens per author. lengths() and vapply() always
# yield integer vectors; sapply() would return an empty list for a table
# with no rows.
lex_var_df$corpus_len_tot <- lengths(lex_var_df$corpus)
lex_var_df$corpus_len_unique <- vapply(
  lex_var_df$corpus,
  function(x) length(unique(x)),
  integer(1)
)

# Share of distinct tokens among all tokens (NaN for an empty corpus).
lex_var_df$corpus_lex_diversity <-
  with(lex_var_df, corpus_len_unique / corpus_len_tot)

# Distinct tokens divided by the log of the total. Left NA when the corpus
# has at most one token, because log(1) is 0 and log(0) is -Inf.
lex_var_df$corpus_lex_diversity_log <- with(
  lex_var_df,
  ifelse(corpus_len_tot > 1,
         corpus_len_unique / log(corpus_len_tot),
         NA_real_)
)

# Diagnostics: does the log-scaled measure still depend on corpus size?
with(lex_var_df, cor.test(corpus_len_tot, corpus_lex_diversity_log))

summary(lex_var_df$corpus_len_tot)
summary(lex_var_df$corpus_len_unique)
summary(with(lex_var_df, corpus_len_unique / corpus_len_tot))
# Density of the diversity ratio, leaving out authors at exactly 1
# (every token distinct).
plot(density(with(lex_var_df, corpus_lex_diversity[corpus_lex_diversity<1]),  na.rm=TRUE))

# Attach the per-author lexical counts. author_unique_id is the only column
# the two tables share, so it is spelled out as the merge key.
forum_activity_author_attributes_01 <- merge(
  forum_activity_author_attributes_01,
  lex_var_df[c("author_unique_id",
               "corpus_len_tot",
               "corpus_len_unique",
               "corpus_lex_diversity")],
  by = "author_unique_id"
)

# Total number of posts (threads plus comments) per author.
forum_activity_author_attributes_01$tot_posts <-
  forum_activity_author_attributes_01$thread_posts +
  forum_activity_author_attributes_01$comment_posts

# Left join with author_omega on its `id` column: every author row is kept,
# and authors without a match get NA in the added columns.
forum_activity_author_attributes_01 <- merge(
  forum_activity_author_attributes_01,
  author_omega,
  by.x = "author_unique_id",
  by.y = "id",
  all.x = TRUE
)

# Drop the `timestamp` column (a no-op if no such column is present).
forum_activity_author_attributes_01[["timestamp"]] <- NULL

# Persist the author table; forum_text is written again to the same file
# used for it earlier in this script.
save(forum_activity_author_attributes_01, file="02_17_m5s_forum_author_activity_attributes_01.RData")
save(forum_text, file="02_17_m5s_forum_text_cleaned_unique.RData")
